Rendering HTML¶

In [ ]:
import plotly
plotly.offline.init_notebook_mode()

Framing the Problem¶

Problem is the risk analysis of patients with diabetes.

Import libraries & Load Dataset¶

In [ ]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn import datasets

# Load the diabetes dataset
data_diabetes = datasets.load_diabetes()
data_diabetes
Out[ ]:
{'data': array([[ 0.03807591,  0.05068012,  0.06169621, ..., -0.00259226,
          0.01990749, -0.01764613],
        [-0.00188202, -0.04464164, -0.05147406, ..., -0.03949338,
         -0.06833155, -0.09220405],
        [ 0.08529891,  0.05068012,  0.04445121, ..., -0.00259226,
          0.00286131, -0.02593034],
        ...,
        [ 0.04170844,  0.05068012, -0.01590626, ..., -0.01107952,
         -0.04688253,  0.01549073],
        [-0.04547248, -0.04464164,  0.03906215, ...,  0.02655962,
          0.04452873, -0.02593034],
        [-0.04547248, -0.04464164, -0.0730303 , ..., -0.03949338,
         -0.00422151,  0.00306441]]),
 'target': array([151.,  75., 141., 206., 135.,  97., 138.,  63., 110., 310., 101.,
         69., 179., 185., 118., 171., 166., 144.,  97., 168.,  68.,  49.,
         68., 245., 184., 202., 137.,  85., 131., 283., 129.,  59., 341.,
         87.,  65., 102., 265., 276., 252.,  90., 100.,  55.,  61.,  92.,
        259.,  53., 190., 142.,  75., 142., 155., 225.,  59., 104., 182.,
        128.,  52.,  37., 170., 170.,  61., 144.,  52., 128.,  71., 163.,
        150.,  97., 160., 178.,  48., 270., 202., 111.,  85.,  42., 170.,
        200., 252., 113., 143.,  51.,  52., 210.,  65., 141.,  55., 134.,
         42., 111.,  98., 164.,  48.,  96.,  90., 162., 150., 279.,  92.,
         83., 128., 102., 302., 198.,  95.,  53., 134., 144., 232.,  81.,
        104.,  59., 246., 297., 258., 229., 275., 281., 179., 200., 200.,
        173., 180.,  84., 121., 161.,  99., 109., 115., 268., 274., 158.,
        107.,  83., 103., 272.,  85., 280., 336., 281., 118., 317., 235.,
         60., 174., 259., 178., 128.,  96., 126., 288.,  88., 292.,  71.,
        197., 186.,  25.,  84.,  96., 195.,  53., 217., 172., 131., 214.,
         59.,  70., 220., 268., 152.,  47.,  74., 295., 101., 151., 127.,
        237., 225.,  81., 151., 107.,  64., 138., 185., 265., 101., 137.,
        143., 141.,  79., 292., 178.,  91., 116.,  86., 122.,  72., 129.,
        142.,  90., 158.,  39., 196., 222., 277.,  99., 196., 202., 155.,
         77., 191.,  70.,  73.,  49.,  65., 263., 248., 296., 214., 185.,
         78.,  93., 252., 150.,  77., 208.,  77., 108., 160.,  53., 220.,
        154., 259.,  90., 246., 124.,  67.,  72., 257., 262., 275., 177.,
         71.,  47., 187., 125.,  78.,  51., 258., 215., 303., 243.,  91.,
        150., 310., 153., 346.,  63.,  89.,  50.,  39., 103., 308., 116.,
        145.,  74.,  45., 115., 264.,  87., 202., 127., 182., 241.,  66.,
         94., 283.,  64., 102., 200., 265.,  94., 230., 181., 156., 233.,
         60., 219.,  80.,  68., 332., 248.,  84., 200.,  55.,  85.,  89.,
         31., 129.,  83., 275.,  65., 198., 236., 253., 124.,  44., 172.,
        114., 142., 109., 180., 144., 163., 147.,  97., 220., 190., 109.,
        191., 122., 230., 242., 248., 249., 192., 131., 237.,  78., 135.,
        244., 199., 270., 164.,  72.,  96., 306.,  91., 214.,  95., 216.,
        263., 178., 113., 200., 139., 139.,  88., 148.,  88., 243.,  71.,
         77., 109., 272.,  60.,  54., 221.,  90., 311., 281., 182., 321.,
         58., 262., 206., 233., 242., 123., 167.,  63., 197.,  71., 168.,
        140., 217., 121., 235., 245.,  40.,  52., 104., 132.,  88.,  69.,
        219.,  72., 201., 110.,  51., 277.,  63., 118.,  69., 273., 258.,
         43., 198., 242., 232., 175.,  93., 168., 275., 293., 281.,  72.,
        140., 189., 181., 209., 136., 261., 113., 131., 174., 257.,  55.,
         84.,  42., 146., 212., 233.,  91., 111., 152., 120.,  67., 310.,
         94., 183.,  66., 173.,  72.,  49.,  64.,  48., 178., 104., 132.,
        220.,  57.]),
 'frame': None,
 'DESCR': '.. _diabetes_dataset:\n\nDiabetes dataset\n----------------\n\nTen baseline variables, age, sex, body mass index, average blood\npressure, and six blood serum measurements were obtained for each of n =\n442 diabetes patients, as well as the response of interest, a\nquantitative measure of disease progression one year after baseline.\n\n**Data Set Characteristics:**\n\n  :Number of Instances: 442\n\n  :Number of Attributes: First 10 columns are numeric predictive values\n\n  :Target: Column 11 is a quantitative measure of disease progression one year after baseline\n\n  :Attribute Information:\n      - age     age in years\n      - sex\n      - bmi     body mass index\n      - bp      average blood pressure\n      - s1      tc, total serum cholesterol\n      - s2      ldl, low-density lipoproteins\n      - s3      hdl, high-density lipoproteins\n      - s4      tch, total cholesterol / HDL\n      - s5      ltg, possibly log of serum triglycerides level\n      - s6      glu, blood sugar level\n\nNote: Each of these 10 feature variables have been mean centered and scaled by the standard deviation times the square root of `n_samples` (i.e. the sum of squares of each column totals 1).\n\nSource URL:\nhttps://www4.stat.ncsu.edu/~boos/var.select/diabetes.html\n\nFor more information see:\nBradley Efron, Trevor Hastie, Iain Johnstone and Robert Tibshirani (2004) "Least Angle Regression," Annals of Statistics (with discussion), 407-499.\n(https://web.stanford.edu/~hastie/Papers/LARS/LeastAngle_2002.pdf)\n',
 'feature_names': ['age',
  'sex',
  'bmi',
  'bp',
  's1',
  's2',
  's3',
  's4',
  's5',
  's6'],
 'data_filename': 'diabetes_data_raw.csv.gz',
 'target_filename': 'diabetes_target.csv.gz',
 'data_module': 'sklearn.datasets.data'}

Converting to dataframe¶

In [ ]:
df_diabetes = pd.DataFrame(data_diabetes.data,columns=data_diabetes.feature_names)
df_diabetes['target'] = data_diabetes.target
df_diabetes.head()
Out[ ]:
age sex bmi bp s1 s2 s3 s4 s5 s6 target
0 0.038076 0.050680 0.061696 0.021872 -0.044223 -0.034821 -0.043401 -0.002592 0.019907 -0.017646 151.0
1 -0.001882 -0.044642 -0.051474 -0.026328 -0.008449 -0.019163 0.074412 -0.039493 -0.068332 -0.092204 75.0
2 0.085299 0.050680 0.044451 -0.005670 -0.045599 -0.034194 -0.032356 -0.002592 0.002861 -0.025930 141.0
3 -0.089063 -0.044642 -0.011595 -0.036656 0.012191 0.024991 -0.036038 0.034309 0.022688 -0.009362 206.0
4 0.005383 -0.044642 -0.036385 0.021872 0.003935 0.015596 0.008142 -0.002592 -0.031988 -0.046641 135.0

Exploratory Data Analysis¶

Descibe the data¶

In [ ]:
df_describe = df_diabetes.describe()
df_describe
Out[ ]:
age sex bmi bp s1 s2 s3 s4 s5 s6 target
count 4.420000e+02 4.420000e+02 4.420000e+02 4.420000e+02 4.420000e+02 4.420000e+02 4.420000e+02 4.420000e+02 4.420000e+02 4.420000e+02 442.000000
mean -2.511817e-19 1.230790e-17 -2.245564e-16 -4.797570e-17 -1.381499e-17 3.918434e-17 -5.777179e-18 -9.042540e-18 9.293722e-17 1.130318e-17 152.133484
std 4.761905e-02 4.761905e-02 4.761905e-02 4.761905e-02 4.761905e-02 4.761905e-02 4.761905e-02 4.761905e-02 4.761905e-02 4.761905e-02 77.093005
min -1.072256e-01 -4.464164e-02 -9.027530e-02 -1.123988e-01 -1.267807e-01 -1.156131e-01 -1.023071e-01 -7.639450e-02 -1.260971e-01 -1.377672e-01 25.000000
25% -3.729927e-02 -4.464164e-02 -3.422907e-02 -3.665608e-02 -3.424784e-02 -3.035840e-02 -3.511716e-02 -3.949338e-02 -3.324559e-02 -3.317903e-02 87.000000
50% 5.383060e-03 -4.464164e-02 -7.283766e-03 -5.670422e-03 -4.320866e-03 -3.819065e-03 -6.584468e-03 -2.592262e-03 -1.947171e-03 -1.077698e-03 140.500000
75% 3.807591e-02 5.068012e-02 3.124802e-02 3.564379e-02 2.835801e-02 2.984439e-02 2.931150e-02 3.430886e-02 3.243232e-02 2.791705e-02 211.500000
max 1.107267e-01 5.068012e-02 1.705552e-01 1.320436e-01 1.539137e-01 1.987880e-01 1.811791e-01 1.852344e-01 1.335973e-01 1.356118e-01 346.000000
  • Dataset contains 442 data.
  • For all the features mean close to 0 and standard deviation close to 1. But for the target variable mean is 152 & standard deviation is 77 which indicates variability.

Plot graphs for each feature and target to find the insights¶

In [ ]:
df_diabetes.hist(figsize=(12,10))
plt.show()
  • The histograms indicates that age,bmi,bp,s1,s2,s3,s4,s5,s6 are centered around mean.
  • But the target variable is rightly skewed, that means higher number of patients with lower risk of diabetes progression and there are only less number of patients with high risk of diabetes progression.

Correlation Matrix¶

In [ ]:
df_diabetes_corr = df_diabetes.corr()
plt.figure(figsize=(12,10))
sns.heatmap(df_diabetes_corr, annot=True)
plt.title('Correlation Matrix of Diabetes Dataset')
plt.show()
  • s3 shows very less correlation with other features and taget.
  • All the features except s3 shows a positive correlation with target variable.
  • BMI shows highest correlation with target among the features. That means change in BMI affects risk of diabetes progression high when comapred to other features.
  • s5 also has good enough correlation with target value.
  • So, BMI & s5 has more importance in predicting the risk of diabetes progression.

Cleaning the data¶

  • The given dataset is need not to be cleaned. Because it is mean centered to 0 and scaled by standard deviation of 1.

Split the dataset¶

In [ ]:
from sklearn.model_selection import train_test_split

# We are taking only BMI & Taget columns because it has already stated that 
# BMI is the independent variable and tage is the dependent variable.

X = df_diabetes[['bmi']]
y= df_diabetes['target']

X_train, X_old, y_train, y_old = train_test_split(X, y, test_size=0.3)

X_val,X_test,y_val,y_test = train_test_split(X_old, y_old, test_size=0.5)

print(X_train.shape, X_val.shape, X_test.shape)
(309, 1) (66, 1) (67, 1)

Polynomial Regression on BMI v/s diesease progression¶

In [ ]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline

def create_poly_model(X,y,degrees):
    models = {}
    for degree in degrees:
        model = Pipeline([('polynomial', PolynomialFeatures(degree=degree)),
                          ('linear', LinearRegression())])
        model.fit(X, y)
        models[degree] = model
    return models
    

degrees = list(range(0, 6))
models = create_poly_model(X_train, y_train, degrees)

# print models
for degree, model in models.items():
    print(f'Degree: {degree}')
    print(f'Model: {model}\n')
Degree: 0
Model: Pipeline(steps=[('polynomial', PolynomialFeatures(degree=0)),
                ('linear', LinearRegression())])

Degree: 1
Model: Pipeline(steps=[('polynomial', PolynomialFeatures(degree=1)),
                ('linear', LinearRegression())])

Degree: 2
Model: Pipeline(steps=[('polynomial', PolynomialFeatures()),
                ('linear', LinearRegression())])

Degree: 3
Model: Pipeline(steps=[('polynomial', PolynomialFeatures(degree=3)),
                ('linear', LinearRegression())])

Degree: 4
Model: Pipeline(steps=[('polynomial', PolynomialFeatures(degree=4)),
                ('linear', LinearRegression())])

Degree: 5
Model: Pipeline(steps=[('polynomial', PolynomialFeatures(degree=5)),
                ('linear', LinearRegression())])

Comapring Models¶

Report : R-Squared,MAPE,MSE for all models¶

In [ ]:
from sklearn.metrics import r2_score, mean_absolute_error

# MAPE function
def mape(y_act, y_pred):
    return np.mean(np.abs((y_act - y_pred) / y_act)) * 100

for degree, model in models.items():
    # Predictions for train and validation set
    y_train_pred = model.predict(X_train)
    y_val_pred = model.predict(X_val)
    
    print(f'Degree: {degree}')
    # Train Data
    print(f'    Train R2: {r2_score(y_train, y_train_pred)}')
    print(f'    Train MAE: {mean_absolute_error(y_train, y_train_pred)}')
    print(f'    Train MAPE: {mape(y_train, y_train_pred)}\n')
   
    # Validation Data
    print(f'    Val R2: {r2_score(y_val, y_val_pred)}')
    print(f'    Val MAE: {mean_absolute_error(y_val, y_val_pred)}')
    print(f'    Val MAPE: {mape(y_val, y_val_pred)}\n')
Degree: 0
    Train R2: 0.0
    Train MAE: 67.57155873943508
    Train MAPE: 63.22079557268291

    Val R2: -0.015216189178788975
    Val MAE: 53.33965872315387
    Val MAPE: 49.74814387824455

Degree: 1
    Train R2: 0.39506875105645234
    Train MAE: 49.813871472226914
    Train MAPE: 46.20225038336912

    Val R2: -0.0409139094701958
    Val MAE: 56.13393800785215
    Val MAPE: 47.957769381130454

Degree: 2
    Train R2: 0.39523244356146825
    Train MAE: 49.8067540678438
    Train MAPE: 46.17728171872351

    Val R2: -0.03633759992609775
    Val MAE: 56.000101186245885
    Val MAPE: 47.7653520590214

Degree: 3
    Train R2: 0.4015699487833455
    Train MAE: 49.12688599328037
    Train MAPE: 45.45320670325321

    Val R2: -0.050041290167614605
    Val MAE: 55.85397765617027
    Val MAPE: 47.03024581498172

Degree: 4
    Train R2: 0.40166108458559047
    Train MAE: 49.13235825623285
    Train MAPE: 45.414285542233266

    Val R2: -0.04880159279634455
    Val MAE: 55.78119348378122
    Val MAPE: 46.98637439333626

Degree: 5
    Train R2: 0.4039026126528047
    Train MAE: 49.040414808408684
    Train MAPE: 45.29869386436929

    Val R2: -0.049438128656875246
    Val MAE: 55.89220579526998
    Val MAPE: 47.18751189682845

  • Model with degree 1 is the best model.
  • It has better R-sqaured value for validation data than other models an have a comapareable value for training. So, it helps to predict the unseen data.

Conclusion¶

Run the Chosen model with test data¶

In [ ]:
from sklearn.metrics import mean_squared_error

models[1].fit(X_test,y_test)  # 5 is the degree of the polynomial model in the models dictionary
y_test_pred = models[1].predict(X_test)

# Evaluating the model with test data
r2_test = r2_score(y_test, y_test_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
mae_test = mean_absolute_error(y_test, y_test_pred)

print('Test Data')
print(f'R2 score: {r2_test}')
print(f'MSE: {mse_test}')
print(f'MAE: {mae_test}')
Test Data
R2 score: 0.3318660414578588
MSE: 4245.949185370177
MAE: 55.29487357659975
In [ ]:
# Plotting the model
plt.figure(figsize=(10,8))
plt.scatter(X_train, y_train, color='blue', label='Train Data')
plt.scatter(X_val, y_val, color='red', label='Validation Data')
plt.scatter(X_test, y_test, color='green', label='Test Data')

plt.plot(X_test, y_test_pred, color='black', label='Degree 1 test data')
plt.plot(X_train, models[1].predict(X_train), color='yellow', label='Degree 1 train data')
plt.plot(X_val, models[1].predict(X_val), color='orange', label='Degree 1 val data')

plt.title('Polynomial Regression with model degree 1')
plt.xlabel('BMI')
plt.ylabel('Target')
plt.legend()
plt.show()

Equation of the predicted model¶

In [ ]:
def print_pipeline_model_stats(model):
    # print model    
    print(f'Model: {model}')
    print(f'Coefficients: {model[-1].coef_}')
    print(f'Intercept: {model[-1].intercept_}')
    # generate equation string:
    equation = 'y = '
    for i, coef in enumerate(model[-1].coef_):
        equation += f'{coef:.2f} * x^{i} + '
    equation += f'{model[-1].intercept_:.2f}'
    print(f'Equation: {equation}')

print_pipeline_model_stats(models[1])
Model: Pipeline(steps=[('polynomial', PolynomialFeatures(degree=1)),
                ('linear', LinearRegression())])
Coefficients: [  0.         923.27355022]
Intercept: 153.73795864250303
Equation: y = 0.00 * x^0 + 923.27 * x^1 + 153.74

Predict manually with bmi value = 0.05¶

In [ ]:
bmi_manual = 0.05
y_pred_manual = 923.27 * (bmi_manual)**1 +  153.74
print(y_pred_manual)

y_model = models[1].predict([[bmi_manual]])
print(y_model)
199.9035
[199.90163615]
d:\AI & ML\ML FOUNDATION\LABS\CSCN8010\venv\CSCN8010_classic_ml\Lib\site-packages\sklearn\base.py:439: UserWarning:

X does not have valid feature names, but PolynomialFeatures was fitted with feature names

  • After predicting and manual calculation we got same value for both.

Trainable Parameters for 6 models¶

In [ ]:
trainable_params = {}
for degree in range(6):
    poly = PolynomialFeatures(degree=degree)
    X_poly = poly.fit_transform(X_train)
    params_count = X_poly.shape[1] 
    trainable_params[degree] = params_count
    print(f'Degree {degree}: {poly.get_feature_names_out()}')

degrees = list(trainable_params.keys())
params = list(trainable_params.values())
print('Degrees:', degrees)
print('Trainable Parameters:', params)
Degree 0: ['1']
Degree 1: ['1' 'bmi']
Degree 2: ['1' 'bmi' 'bmi^2']
Degree 3: ['1' 'bmi' 'bmi^2' 'bmi^3']
Degree 4: ['1' 'bmi' 'bmi^2' 'bmi^3' 'bmi^4']
Degree 5: ['1' 'bmi' 'bmi^2' 'bmi^3' 'bmi^4' 'bmi^5']
Degrees: [0, 1, 2, 3, 4, 5]
Trainable Parameters: [1, 2, 3, 4, 5, 6]